/******************************************************************************
Author: Akshay Dixit
Date Created: November 20, 2018
Date last modified: June 19, 2020
Project: T4D Tanzania
Purpose: Adjusting intermediate outcome p-values for multiple hypotheses testing
******************************************************************************/

clear
set more off

* Working folder should be the Tanzania data folder
cd "$data"	

import exc using "pvalues_intermediate_outcomes.xlsx", first

/*
	
	This dataset contains a vector of p-values for all 100+ intermediate outcomes.
	These 100+ intermediate outcomes draw on two different data sources: Facility survey, and Household survey.
	
	This. do file adjusts the p-values for multiple hypotheses testing by data source.
	Accordingly, it is divided into two halves, the former for facility and the latter for household outcomes.
	For each half, it displays the number of hypotheses that are rejected (alpha=0.05), and the distribution of q-values.
	
	It generates sharpened FDR two-stage q-values, as calculated in Dupas, Huillery & Seban (2018),
	"Risk information, risk salience, and adolescent sexual behavior: Experimental evidence from Cameroon." 
	Journal of Economic Behavior & Organization, 145, 151-175
	
	FDR two-stage q-values were introduced in Benjamini, Krieger & Yekutieli (2006), 
	"Adaptive Linear Step-up Procedures that Control the False Discovery Rate", 
	Biometrika, 93(3), 491-507
	
*/

******************************************************************************

*** INTERMEDIATE OUTCOMES FROM THE FACILITY SURVEY ***

keep if dataset == "facility"

* Collect the total number of p-values tested

qui sum pval
local totalpvals = r(N)

* Sort the p-values in ascending order and generate a variable that codes each p-value's rank

qui g int original_sorting_order = _n 
qui sort pval
qui g int rank = _n if pval~=.

* Set the initial counter to 1 

local qval = 1

* Generate the variable that will contain the BKY (2006) sharpened q-values

qui g bky06_qval = 1 if pval~=.

* Set up a loop that begins by checking which hypotheses are rejected at q = 1.000, then checks which hypotheses are rejected at q = 0.999, then checks which hypotheses are rejected at q = 0.998, etc.  The loop ends by checking which hypotheses are rejected at q = 0.001.

while `qval' > 0 {
	* First Stage
	* Generate the adjusted first stage q level we are testing: q' = q/1+q
	local qval_adj = `qval'/(1+`qval')
	* Generate value q'*r/M
	g fdr_temp1 = `qval_adj'*rank/`totalpvals'
	* Generate binary variable checking condition p(r) <= q'*r/M
	g reject_temp1 = (fdr_temp1>=pval) if pval~=.
	* Generate variable containing p-value ranks for all p-values that meet above condition
	g reject_rank1 = reject_temp1*rank
	* Record the rank of the largest p-value that meets above condition
	egen total_rejected1 = max(reject_rank1)

	* Second Stage
	* Generate the second stage q level that accounts for hypotheses rejected in first stage: q_2st = q'*(M/m0)
	local qval_2st = `qval_adj'*(`totalpvals'/(`totalpvals'-total_rejected1[1]))
	* Generate value q_2st*r/M
	g fdr_temp2 = `qval_2st'*rank/`totalpvals'
	* Generate binary variable checking condition p(r) <= q_2st*r/M
	g reject_temp2 = (fdr_temp2>=pval) if pval~=.
	* Generate variable containing p-value ranks for all p-values that meet above condition
	g reject_rank2 = reject_temp2*rank
	* Record the rank of the largest p-value that meets above condition
	egen total_rejected2 = max(reject_rank2)

	* A p-value has been rejected at level q if its rank is less than or equal to the rank of the max p-value that meets the above condition
	replace bky06_qval = `qval' if rank <= total_rejected2 & rank~=.
	* Reduce q by 0.001 and repeat loop
	drop fdr_temp* reject_temp* reject_rank* total_rejected*
	local qval = `qval' - .001

}	

qui sort original_sorting_order
qui count if bky06_qval < 0.05
local significant = r(N)

di "************************************************************************************"
di "INTERMEDIATE OUTCOMES: T4D TANZANIA FACILITY SURVEY"
di "A total of `totalpvals' hypotheses tested"
di "Based on the q-values, a total of `significant' hypotheses rejected at the 95% level"
di "Benjamini Krieger Yekutieli (2006) sharpened q-vals are in variable 'bky06_qval'"
di "The distribution of q-values is as follows:"
tab bky06_qval
di "************************************************************************************"

******************************************************************************

*** INTERMEDIATE OUTCOMES FROM THE HOUSEHOLD SURVEY ***

clear

import exc using "pvalues_intermediate_outcomes.xlsx", first
keep if dataset == "household"

* Collect the total number of p-values tested

qui sum pval
local totalpvals = r(N)

* Sort the p-values in ascending order and generate a variable that codes each p-value's rank

qui g int original_sorting_order = _n 
qui sort pval
qui g int rank = _n if pval~=.

* Set the initial counter to 1 

local qval = 1

* Generate the variable that will contain the BKY (2006) sharpened q-values

qui g bky06_qval = 1 if pval~=.

* Set up a loop that begins by checking which hypotheses are rejected at q = 1.000, then checks which hypotheses are rejected at q = 0.999, then checks which hypotheses are rejected at q = 0.998, etc.  The loop ends by checking which hypotheses are rejected at q = 0.001.

while `qval' > 0 {
	* First Stage
	* Generate the adjusted first stage q level we are testing: q' = q/1+q
	local qval_adj = `qval'/(1+`qval')
	* Generate value q'*r/M
	g fdr_temp1 = `qval_adj'*rank/`totalpvals'
	* Generate binary variable checking condition p(r) <= q'*r/M
	g reject_temp1 = (fdr_temp1>=pval) if pval~=.
	* Generate variable containing p-value ranks for all p-values that meet above condition
	g reject_rank1 = reject_temp1*rank
	* Record the rank of the largest p-value that meets above condition
	egen total_rejected1 = max(reject_rank1)

	* Second Stage
	* Generate the second stage q level that accounts for hypotheses rejected in first stage: q_2st = q'*(M/m0)
	local qval_2st = `qval_adj'*(`totalpvals'/(`totalpvals'-total_rejected1[1]))
	* Generate value q_2st*r/M
	g fdr_temp2 = `qval_2st'*rank/`totalpvals'
	* Generate binary variable checking condition p(r) <= q_2st*r/M
	g reject_temp2 = (fdr_temp2>=pval) if pval~=.
	* Generate variable containing p-value ranks for all p-values that meet above condition
	g reject_rank2 = reject_temp2*rank
	* Record the rank of the largest p-value that meets above condition
	egen total_rejected2 = max(reject_rank2)

	* A p-value has been rejected at level q if its rank is less than or equal to the rank of the max p-value that meets the above condition
	replace bky06_qval = `qval' if rank <= total_rejected2 & rank~=.
	* Reduce q by 0.001 and repeat loop
	drop fdr_temp* reject_temp* reject_rank* total_rejected*
	local qval = `qval' - .001

}	

qui sort original_sorting_order
qui count if bky06_qval < 0.05
local significant = r(N)

di "************************************************************************************"
di "INTERMEDIATE OUTCOMES: T4D TANZANIA HOUSEHOLD SURVEY"
di "A total of `totalpvals' hypotheses tested"
di "Based on the q-values, a total of `significant' hypotheses rejected at the 95% level"
di "Benjamini Krieger Yekutieli (2006) sharpened q-vals are in variable 'bky06_qval'"
di "The distribution of q-values is as follows:"
tab bky06_qval
di "************************************************************************************"
	
******************************************************************************

clear
